"""
About the Switchboard corpus

    This is conversational telephone speech collected as 2-channel, 8kHz-sampled
    data.  We are using just the Switchboard-1 Phase 1 training data.
    The catalog number LDC97S62 (Switchboard-1 Release 2) corresponds, we believe,
    to what we have.  We also use the Mississippi State transcriptions, which
    we download separately from
    http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz

    This data is not available for free - your institution needs to have an LDC subscription.
"""
import tarfile
from itertools import chain
from pathlib import Path
from typing import Dict, Optional, Union

from lhotse.audio import Recording, RecordingSet
from lhotse.supervision import SupervisionSegment, SupervisionSet
from lhotse.utils import Pathlike, check_and_rglob, urlretrieve_progress

SWBD_TEXT_URL = 'http://www.isip.piconepress.com/projects/switchboard/releases/switchboard_word_alignments.tar.gz'


def prepare_switchboard(
        audio_dir: Pathlike,
        transcripts_dir: Optional[Pathlike] = None,
        sentiment_dir: Optional[Pathlike] = None,
        output_dir: Optional[Pathlike] = None,
        omit_silence: bool = True,
        absolute_paths: bool = False
) -> Dict[str, Union[RecordingSet, SupervisionSet]]:
    """
    Prepare manifests for the Switchboard corpus.
    We create two manifests: one with recordings, and the other one with text supervisions.
    When ``sentiment_dir`` is provided, we create another supervision manifest with sentiment annotations.

    :param audio_dir: Path to ``LDC97S62`` package.
    :param transcripts_dir: Path to the transcripts directory (typically named "swb_ms98_transcriptions").
        If not provided, the transcripts will be downloaded.
    :param sentiment_dir: Optional path to ``LDC2020T14`` package which contains sentiment annotations
        for SWBD segments.
    :param output_dir: Directory where the manifests should be written. Can be omitted to avoid writing.
    :param omit_silence: Whether supervision segments with ``[silence]`` token should be removed or kept.
    :return: A dict with manifests. The keys are: ``{'recordings', 'supervisions'}``.
    """
    if transcripts_dir is None:
        transcripts_dir = download_and_untar()
    audio_paths = check_and_rglob(audio_dir, '*.sph')
    text_paths = check_and_rglob(transcripts_dir, '*trans.text')

    groups = []
    name_to_text = {p.stem.split('-')[0]: p for p in text_paths}
    for ap in audio_paths:
        name = ap.stem.replace('sw0', 'sw')
        groups.append({'audio': ap, 'text-0': name_to_text[f'{name}A'], 'text-1': name_to_text[f'{name}B']})

    recordings = RecordingSet.from_recordings(
        Recording.from_sphere(group['audio'], relative_path_depth=None if absolute_paths else 3)
        for group in groups
    )
    supervisions = SupervisionSet.from_segments(chain.from_iterable(
        make_segments(
            transcript_path=group[f'text-{channel}'],
            recording=recording,
            channel=channel,
            omit_silence=omit_silence
        )
        for group, recording in zip(groups, recordings)
        for channel in [0, 1]
    ))

    if sentiment_dir is not None:
        parse_and_add_sentiment_labels(sentiment_dir, supervisions)

    if output_dir is not None:
        output_dir = Path(output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        recordings.to_json(output_dir / 'recordings.json')
        supervisions.to_json(output_dir / 'supervisions.json')
    return {
        'recordings': recordings,
        'supervisions': supervisions
    }


def make_segments(transcript_path: Path, recording: Recording, channel: int, omit_silence: bool = True):
    lines = transcript_path.read_text().splitlines()
    return [
        SupervisionSegment(
            id=segment_id,
            recording_id=recording.id,
            start=float(start),
            duration=round(float(end) - float(start), ndigits=8),
            channel=channel,
            text=' '.join(words),
            language='English',
            speaker=f'{recording.id}A'
        )
        for segment_id, start, end, *words in map(str.split, lines)
        if words[0] != '[silence]' or not omit_silence
    ]


def download_and_untar(
        target_dir: Pathlike = '.',
        force_download: bool = False,
        url: str = SWBD_TEXT_URL
) -> Path:
    target_dir = Path(target_dir)
    transcript_dir = target_dir / 'swb_ms98_transcriptions'
    if transcript_dir.is_dir():
        return transcript_dir
    target_dir.mkdir(parents=True, exist_ok=True)
    tar_name = 'switchboard_word_alignments.tar.gz'
    tar_path = target_dir / tar_name
    if force_download or not tar_path.is_file():
        urlretrieve_progress(url, filename=tar_path, desc=f'Downloading {tar_name}')
    with tarfile.open(tar_path) as tar:
        tar.extractall(path=target_dir)
    return transcript_dir


def parse_and_add_sentiment_labels(sentiment_dir: Pathlike, supervisions: SupervisionSet):
    """Read 'LDC2020T14' sentiment annotations and add then to the supervision segments."""
    import pandas as pd
    # Sanity checks
    sentiment_dir = Path(sentiment_dir)
    labels = sentiment_dir / 'data' / 'sentiment_labels.tsv'
    assert sentiment_dir.is_dir() and labels.is_file()
    # Read the TSV as a dataframe
    df = pd.read_csv(labels, delimiter='\t', names=['id', 'start', 'end', 'sentiment'])
    # We are going to match the segments in LDC2020T14 with the ones we already
    # parsed from ISIP transcripts. We simply look which of the existing segments
    # fall into a sentiment-annotated time span. When doing it this way, we use
    # 51773 out of 52293 available sentiment annotations, which should be okay.
    for _, row in df.iterrows():
        call_id = row['id'].split('_')[0]
        matches = list(supervisions.find(
            recording_id=call_id,
            start_after=row['start'] - 1e-2,
            end_before=row['end'] + 1e-2,
        ))
        if not matches:
            continue
        labels = row['sentiment'].split('#')
        # SupervisionSegments returned from .find() are references to the ones in the
        # SupervisionSet, so we can just modify them. We use the "custom" field
        # to add the sentiment label. Since there were multiple annotators,
        # we add all available labels and leave it up to the user to disambiguate them.
        for segment in matches:
            segment.custom = {f'sentiment{i}': label for i, label in enumerate(labels)}
